{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "gensim_LDA_corpus_creation.ipynb",
      "provenance": [],
      "toc_visible": true,
      "mount_file_id": "1vxPpxYqAf5N8rwOH491Fdj_aIj8fEcN0",
      "authorship_tag": "ABX9TyMvvJqWYGI/boZsNe8dfgLi",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/monsund/unsupervised-classification-of-Linkedin-Profiles-using-KMeans-LDA-TFIDF/blob/master/gensim_LDA_corpus_creation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "kbQlTWfvlW6r"
      },
      "source": [
        "# **Importing Gensim Libraries**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Ly6Cjoe88rdz"
      },
      "source": [
        "import gensim\n",
        "import gensim.corpora as corpora\n",
        "from gensim.utils import simple_preprocess\n",
        "from gensim.models import CoherenceModel"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iqxzA_r19Jtf"
      },
      "source": [
        "# **Other Libraries**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2h5wZw3V9O7g",
        "outputId": "726d3771-3100-4d42-ef13-77eddc0e5037",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "import spacy\n",
        "import nltk \n",
        "nltk.download('stopwords')\n",
        "import pandas as pd"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Unzipping corpora/stopwords.zip.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GOPQ7nl8YRfP",
        "outputId": "7b5f9dca-6178-468c-9816-4bba928ffdd7",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 187
        }
      },
      "source": [
        "!pip show spacy"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Name: spacy\n",
            "Version: 2.2.4\n",
            "Summary: Industrial-strength Natural Language Processing (NLP) in Python\n",
            "Home-page: https://spacy.io\n",
            "Author: Explosion\n",
            "Author-email: contact@explosion.ai\n",
            "License: MIT\n",
            "Location: /usr/local/lib/python3.6/dist-packages\n",
            "Requires: thinc, srsly, blis, preshed, numpy, catalogue, tqdm, requests, plac, murmurhash, setuptools, cymem, wasabi\n",
            "Required-by: fastai, en-core-web-sm\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "44tfQKaG9mDO"
      },
      "source": [
        "# **Import and Read File**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0zUq4lXg9QQE"
      },
      "source": [
        "# Read the LinkedIn profile CSV from the Google Drive folder under /content/drive into a DataFrame\n",
        "df = pd.read_csv('/content/drive/My Drive/NLP/with_swaraj/Data/linkedin_about_no_foreign_lang.csv')"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "m3oJYYWy9lKp",
        "outputId": "70651785-95e8-41d3-e4db-9a38a3420d47",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 204
        }
      },
      "source": [
        "# Discard rows with any missing value, then remove the two columns that are not used below\n",
        "df = df.dropna().drop(columns=['cleaned_about_us', 'cluster'])\n",
        "df.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>url</th>\n",
              "      <th>username</th>\n",
              "      <th>title</th>\n",
              "      <th>about</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>https://www.linkedin.com/in/lokesh-kumar-xess-...</td>\n",
              "      <td>lokesh-kumar-xess-54814068</td>\n",
              "      <td>Marketing &amp; Strategy at Mobistreak</td>\n",
              "      <td>I am a Google Adwords and Google Analytics cer...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>https://in.linkedin.com/in/sujithnarayanan</td>\n",
              "      <td>sujithnarayanan</td>\n",
              "      <td>Reimagining Financial Services for India</td>\n",
              "      <td>Ideate. Execute. Disrupt. Iterate.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>https://www.linkedin.com/in/adarshramakrishnan</td>\n",
              "      <td>adarshramakrishnan</td>\n",
              "      <td>Product Strategist | Product Manager | Triple ...</td>\n",
              "      <td>Over the past decade, I have co-founded 3 vent...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>https://www.linkedin.com/in/rhythm-bhatnagar-4...</td>\n",
              "      <td>rhythm-bhatnagar-4350b551</td>\n",
              "      <td>Product Marketer | Ex-SHEROES | Early-Stage St...</td>\n",
              "      <td>Hey,I am Rhythm. I am passionate about startup...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>https://www.linkedin.com/in/karthiksureshlbs</td>\n",
              "      <td>karthiksureshlbs</td>\n",
              "      <td>Product at Facebook | CMU MS | LBS MBA</td>\n",
              "      <td>I am a product-centric builder with experience...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                                 url  ...                                              about\n",
              "0  https://www.linkedin.com/in/lokesh-kumar-xess-...  ...  I am a Google Adwords and Google Analytics cer...\n",
              "1         https://in.linkedin.com/in/sujithnarayanan  ...                 Ideate. Execute. Disrupt. Iterate.\n",
              "2     https://www.linkedin.com/in/adarshramakrishnan  ...  Over the past decade, I have co-founded 3 vent...\n",
              "3  https://www.linkedin.com/in/rhythm-bhatnagar-4...  ...  Hey,I am Rhythm. I am passionate about startup...\n",
              "4       https://www.linkedin.com/in/karthiksureshlbs  ...  I am a product-centric builder with experience...\n",
              "\n",
              "[5 rows x 4 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 4
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "w-mezAedYt-u"
      },
      "source": [
        "# **Remove Punctuation**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "PslC-zpLYQsq"
      },
      "source": [
        "# define punctuation\n",
        "# The backslash is spelled '\\\\' explicitly: a bare '\\,' is an invalid escape sequence\n",
        "# and triggers a warning on recent Python versions. The resulting string is unchanged.\n",
        "punctuations = '\\'!()-[]{};:\\\"\\\\,<>./?#$%^&*_~'"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1B9BqylmaLmg"
      },
      "source": [
        "#Function for removing punctuation and lowering case\n",
        "def remove_punct(text):\n",
        "  \"\"\"Lower-case `text` and blank out every character listed in `punctuations`.\n",
        "\n",
        "  Each punctuation character becomes a space; the resulting runs of\n",
        "  whitespace are collapsed to single spaces and the ends are trimmed.\n",
        "  \"\"\"\n",
        "  # Lower-case unconditionally. Doing it only when punctuation is present\n",
        "  # leaves punctuation-free texts capitalised, and the stop-word filter\n",
        "  # applied afterwards is case-sensitive.\n",
        "  text = text.lower()\n",
        "  # A single translate() pass replaces all punctuation characters at once.\n",
        "  text = text.translate(str.maketrans({char: ' ' for char in punctuations}))\n",
        "  return ' '.join(text.split())"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "eBikD64SanB3",
        "outputId": "b38d7063-a8bc-43af-8fdc-223985995c42",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 176
        }
      },
      "source": [
        "# New column: the raw 'about' text passed through remove_punct\n",
        "df['about_cleaned'] = df['about'].apply(remove_punct)\n",
        "df.head(3)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>url</th>\n",
              "      <th>username</th>\n",
              "      <th>title</th>\n",
              "      <th>about</th>\n",
              "      <th>about_cleaned</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>https://www.linkedin.com/in/lokesh-kumar-xess-...</td>\n",
              "      <td>lokesh-kumar-xess-54814068</td>\n",
              "      <td>Marketing &amp; Strategy at Mobistreak</td>\n",
              "      <td>I am a Google Adwords and Google Analytics cer...</td>\n",
              "      <td>i am a google adwords and google analytics cer...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>https://in.linkedin.com/in/sujithnarayanan</td>\n",
              "      <td>sujithnarayanan</td>\n",
              "      <td>Reimagining Financial Services for India</td>\n",
              "      <td>Ideate. Execute. Disrupt. Iterate.</td>\n",
              "      <td>ideate execute disrupt iterate</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>https://www.linkedin.com/in/adarshramakrishnan</td>\n",
              "      <td>adarshramakrishnan</td>\n",
              "      <td>Product Strategist | Product Manager | Triple ...</td>\n",
              "      <td>Over the past decade, I have co-founded 3 vent...</td>\n",
              "      <td>over the past decade i have co founded 3 ventu...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                                 url  ...                                      about_cleaned\n",
              "0  https://www.linkedin.com/in/lokesh-kumar-xess-...  ...  i am a google adwords and google analytics cer...\n",
              "1         https://in.linkedin.com/in/sujithnarayanan  ...                    ideate execute disrupt iterate \n",
              "2     https://www.linkedin.com/in/adarshramakrishnan  ...  over the past decade i have co founded 3 ventu...\n",
              "\n",
              "[3 rows x 5 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 7
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "828lcEGxpBAJ"
      },
      "source": [
        "# **Remove Stopwords**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BY3d-ytMowtT",
        "outputId": "0f33d353-7548-4dc0-a43b-24c20aeda79f",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 85
        }
      },
      "source": [
        "import nltk\n",
        "nltk.download('punkt')\n",
        "from nltk.corpus import stopwords\n",
        "nltk.download('stopwords')\n",
        "from nltk.tokenize import word_tokenize"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
            "[nltk_data]   Unzipping tokenizers/punkt.zip.\n",
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Package stopwords is already up-to-date!\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "yd0xuAYTpzSJ"
      },
      "source": [
        "# NLTK's English stop-word list, and extra domain words to be filtered as well\n",
        "stop_words = set(stopwords.words('english'))\n",
        "additional_stop_words = {'technology', 'development', 'experience', 'project', 'company'}"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NCRfTHrX7Txx"
      },
      "source": [
        "# Merge both sets into the final stop-word set\n",
        "stop_words = stop_words | additional_stop_words"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "lMGSaDrdpO95"
      },
      "source": [
        "# Function removing stopwords-----------\n",
        "def remove_stopwords(text):\n",
        "  \"\"\"Tokenize `text` with NLTK and return the tokens not found in `stop_words`, joined by single spaces.\"\"\"\n",
        "  kept = []\n",
        "  for token in word_tokenize(text):\n",
        "    if token not in stop_words:\n",
        "      kept.append(token)\n",
        "  return ' '.join(kept)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Ze5URi9hqOAi"
      },
      "source": [
        "# Overwrite the cleaned column with its stop-word-free version\n",
        "df['about_cleaned'] = df['about_cleaned'].apply(remove_stopwords)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "euHoGSKU8XSr"
      },
      "source": [
        "# **Tokenize words and Clean-up text**\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "J9gaBaWdC5p9"
      },
      "source": [
        "Let’s tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether.\n",
        "\n",
        "Gensim’s ***simple_preprocess()*** is great for this. Additionally, I have set **deacc=True** to strip accent marks from the tokens (punctuation is already discarded by the tokenizer itself)."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zfNOOZu6D3NB"
      },
      "source": [
        "# Gensim\n",
        "import gensim\n",
        "# import gensim.corpora as corpora\n",
        "from gensim.utils import simple_preprocess\n",
        "# from gensim.models import CoherenceModel"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Bqlusuvb8qLr"
      },
      "source": [
        "def sent_to_words(sentences):\n",
        "    \"\"\"Yield one token list per document in `sentences`, keeping tokens of at least 2 characters.\"\"\"\n",
        "    for doc in sentences:\n",
        "        # deacc=True strips accent marks from the tokens\n",
        "        tokens = gensim.utils.simple_preprocess(str(doc), min_len=2, deacc=True)\n",
        "        yield tokens"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GpnN2_slsKgB",
        "outputId": "231833ad-8bf7-46c3-bd17-72dcc338db20",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 54
        }
      },
      "source": [
        "# Materialize the token generator so the documents can be indexed and reused\n",
        "data_words = [tokens for tokens in sent_to_words(df['about_cleaned'])]\n",
        "print(data_words[:3])\n",
        "# when entire 'data_words' is printed it gives error saying 'IOPub data rate exceeded' because of large amount of data."
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[['google', 'adwords', 'google', 'analytics', 'certified', 'ppc', 'expert', 'five', 'years', 'areas', 'search', 'engine', 'marketing', 'seo', 'ppc', 'website', 'speed', 'optimization', 'email', 'marketing', 'currently', 'working', 'manager', 'media', 'services', 'los', 'angeles', 'based', 'marketing', 'firm', 'mobistreak', 'inc', 'manage', 'monthly', 'adwords', 'spending', 'firm', 'portfolio', 'includes', 'cable', 'internet', 'dentist', 'auto', 'insurance', 'sr', 'insurance', 'health', 'insurance', 'mortgage', 'hotel', 'flight', 'booking', 'home', 'improvement', 'flower', 'delivery', 'industry', 'strong', 'ability', 'develop', 'understand', 'marketing', 'strategies', 'big', 'picture', 'campaign', 'level', 'feel', 'free', 'reach', 'interested', 'taking', 'business', 'next', 'level', 'effective', 'digital', 'advertising', 'strategy'], ['ideate', 'execute', 'disrupt', 'iterate'], ['past', 'decade', 'co', 'founded', 'ventures', 'elegant', 'solutions', 'inseyete', 'startship', 'provided', 'end', 'end', 'product', 'consulting', 'services', 'design', 'management', 'strategy', 'clients', 'world', 'wide', 'served', 'head', 'product', 'archbolt', 'also', 'built', 'side', 'career', 'instructor', 'higher', 'education', 'teaching', 'courses', 'leadership', 'negotiations', 'products', 'university', 'virginia', 'virginia', 'commonwealth', 'university', 'connect', 'products', 'design']]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "n9mskXUwJThi"
      },
      "source": [
        "# **Creating Bigram and Trigram**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BWNmyX1oJYYq"
      },
      "source": [
        "# Build the bigram and trigram models\n",
        "# (only the bigram model is active; the trigram line below is left commented out)\n",
        "bigram = gensim.models.Phrases(data_words, min_count=1, threshold=70) # higher threshold fewer phrases.\n",
        "# trigram = gensim.models.Phrases(bigram[data_words], threshold=30) "
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "g3qm-VLMJfOM"
      },
      "source": [
        "# Faster way to get a sentence clubbed as a trigram/bigram:\n",
        "# wrap the trained Phrases model in a Phraser, which is what make_bigrams applies to each document\n",
        "bigram_mod = gensim.models.phrases.Phraser(bigram)\n",
        "# trigram_mod = gensim.models.phrases.Phraser(trigram)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xdVsihwqJkfS"
      },
      "source": [
        "# # See trigram example\n",
        "# # print(trigram_mod[bigram_mod[data_words[0]]])\n",
        "# for i in range(1000):\n",
        "#   print(bigram_mod[data_words[i]])"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "eNDVQWaZEx9K"
      },
      "source": [
        "# **Create function for Making Bigram, Trigram and Lemmatization**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "fDCAhXkXJoG7"
      },
      "source": [
        "def make_bigrams(words_list):\n",
        "    \"\"\"Apply the fitted `bigram_mod` phraser to every tokenized document in `words_list`.\"\"\"\n",
        "    phrased = []\n",
        "    for words in words_list:\n",
        "        phrased.append(bigram_mod[words])\n",
        "    return phrased\n",
        "\n",
        "# def make_trigrams(texts):\n",
        "#     return [trigram_mod[bigram_mod[words]] for words in words_list]\n",
        "\n",
        "def lemmatization(texts, allowed_postags=['NOUN', 'PROPN', 'ADJ', 'VERB', 'ADV']):\n",
        "    \"\"\"Lemmatize tokenized documents, keeping only tokens whose POS tag is allowed.\n",
        "\n",
        "    texts: iterable of token lists; each list is joined with spaces and run\n",
        "        through the global spaCy pipeline `nlp`.\n",
        "    allowed_postags: coarse-grained POS tags to keep (https://spacy.io/api/annotation).\n",
        "    Returns a list holding one list of lemmas per input document.\n",
        "    \"\"\"\n",
        "    # Note the comma between 'PROPN' and 'ADJ' in the default: without it Python\n",
        "    # concatenates the two literals into 'PROPNADJ' and neither tag is matched.\n",
        "    texts_out = []\n",
        "    for sent in texts:\n",
        "        doc = nlp(\" \".join(sent))\n",
        "        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])\n",
        "    return texts_out"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "4jF0B9KiOOBz"
      },
      "source": [
        "# Form Bigrams \n",
        "# Run every tokenized document through the bigram phraser\n",
        "data_words_bigram = make_bigrams(data_words)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4mEFmlXbNvzD"
      },
      "source": [
        "# **Importing Spacy**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "hB7DAm73NyyV"
      },
      "source": [
        "%cd /content/drive/My Drive/NLP/with_swaraj/Code\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "UbRCjJHoOHkQ"
      },
      "source": [
        "!pip install en_core_web_lg-2.1.0.tar.gz"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "iVHTQDKqOQOT",
        "outputId": "4bb47efc-7b6b-4ef3-c143-a4242e64677e",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "import spacy\n",
        "# Fetch and install the 'en_core_web_lg' model so that spacy.load() can find it\n",
        "spacy.cli.download('en_core_web_lg')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
            "You can now load the model via spacy.load('en_core_web_lg')\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "dIRa6XRglOLe"
      },
      "source": [
        "# Initialize spacy 'en_core_web_lg' model with the parser and NER components disabled (for efficiency):\n",
        "# lemmatization() only reads token.pos_ and token.lemma_\n",
        "\n",
        "nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Y_d3gJ2RlZII"
      },
      "source": [
        "# Do lemmatization keeping only noun, adj, vb, adv\n",
        "# Lemmatize the bigram documents, keeping only nouns, proper nouns, adjectives, verbs and adverbs\n",
        "data_lemmatized = lemmatization(data_words_bigram, allowed_postags=['NOUN', 'PROPN', 'ADJ', 'VERB', 'ADV'])"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ns-3gUdwzvfb"
      },
      "source": [
        "# **Saving data_lemmatized**"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2BBOhz_Z0C3P"
      },
      "source": [
        "data_lemmatized is a list which contains list of words, document wise"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oxblyBl2zd0k"
      },
      "source": [
        "import pickle\n",
        "\n",
        "# Save the lemmatized documents to Drive.\n",
        "# NOTE: despite its '.json' name the file is a binary pickle -- read it back with pickle.load(), not json.load().\n",
        "with open(\"/content/drive/My Drive/NLP/with_swaraj/Data/data_list_for_lda_dictionary.json\", \"wb\") as fp:   #Pickling\n",
        "  pickle.dump(data_lemmatized, fp)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "xloJAg5EPrY-"
      },
      "source": [
        "# **Create the Dictionary and Corpus needed for Topic Modeling**"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "LEDIZrvOPzn3"
      },
      "source": [
        "The two main inputs to the LDA topic model are the dictionary(id2word) and the corpus. Let’s create them."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "uh8t-3olOhXP",
        "outputId": "f7dde204-8edc-47ca-cbb7-909413138a90",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "import gensim.corpora as corpora\n",
        "\n",
        "# Create Dictionary: the id <-> token vocabulary built from all lemmatized documents\n",
        "id2word = corpora.Dictionary(data_lemmatized)\n",
        "print(f'Total vocabulary size: {len(id2word)}')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Total vocabulary size: 41089\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "YhKWPy0oQsXm"
      },
      "source": [
        "# Create Corpus\n",
        "# Alias the lemmatized documents; the bag-of-words corpus is built from `texts` in the next cell\n",
        "texts = data_lemmatized"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "-bNDOg0ZP6Ic"
      },
      "source": [
        "# Term Document Frequency\n",
        "# Bag-of-words form of every document: a list of (token_id, count) pairs\n",
        "corpus = list(map(id2word.doc2bow, texts))"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "u1RtGYGVQt8P",
        "outputId": "7ca2698d-fe99-487c-dd34-fc39f0f95c0c",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 187
        }
      },
      "source": [
        "# Inspect the (token_id, count) pairs of the document at index 9\n",
        "corpus[9]"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[(182, 1),\n",
              " (416, 1),\n",
              " (417, 1),\n",
              " (418, 1),\n",
              " (419, 2),\n",
              " (420, 1),\n",
              " (421, 1),\n",
              " (422, 1),\n",
              " (423, 1),\n",
              " (424, 1)]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 57
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7jo9ClMCGydE"
      },
      "source": [
        "# **Find Frequency of words in corpus**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Ca3by6dzG8QK"
      },
      "source": [
        "# Sum each token's count over every document's bag-of-words\n",
        "dict_corpus = {}\n",
        "for bow in corpus:\n",
        "  for token_id, count in bow:\n",
        "    token = id2word[token_id]\n",
        "    dict_corpus[token] = dict_corpus.get(token, 0) + count\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MTsh0Av3EyI9"
      },
      "source": [
        "# **Making an Ordered Dictionary Type**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "QjDqLR2NDCdW"
      },
      "source": [
        "import collections\n",
        "# Sort the (word, frequency) pairs by word and keep that order in an OrderedDict\n",
        "od = collections.OrderedDict(sorted(dict_corpus.items()))"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Z-MAgkGpE8gp"
      },
      "source": [
        "# **Creating DataFrame for word vs frequency**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "QVgMDPl7ExA4"
      },
      "source": [
        "# One {'words', 'freq'} record per vocabulary entry, in the order stored in `od`\n",
        "dict_corpus_list = [{'words': word, 'freq': freq} for word, freq in od.items()]\n",
        "\n",
        "dict_corpus_df = pd.DataFrame(dict_corpus_list)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "5MhEaUSkEXvW"
      },
      "source": [
        "# Lift pandas' row and column display limits so the whole frame is rendered\n",
        "pd.set_option(\"display.max_rows\", None)\n",
        "pd.set_option(\"display.max_columns\", None)\n",
        "dict_corpus_df.sort_values('words')"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2wlZRroOImFR"
      },
      "source": [
        "# **Plotting histogram of word frequencies**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wxCb5vF1JAEc"
      },
      "source": [
        "import seaborn as sns\n",
        "import matplotlib.pyplot as plt\n",
        "# sns.set()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Goj21HgEI0MW",
        "outputId": "dbf9a05e-c811-4c15-a177-0c4036cc74c6",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 442
        }
      },
      "source": [
        "plt.figure(figsize=(8,6))\n",
        "# distplot is deprecated in seaborn >= 0.11; histplot with a density-scaled KDE overlay replaces it\n",
        "sns.histplot(dict_corpus_df['freq'], bins=200, kde=True, stat='density', kde_kws=dict(cut=3));"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
            "  warnings.warn(msg, FutureWarning)\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "display_data",
          "data": {
            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAf4AAAFzCAYAAADfQWsjAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3df7BkZX3n8ff3zp3hlwIyjEoYcMjCakazMTpBXV1rN0SERB13AxWIUTZLQhJls4mV3RqSDZVQWrVkq+LGkiSSYJawMWAwmIkZFyWYpOJukEFRGHDiFXGZEcOABJTwa+jv/nFO33u6p4fpc5ye7ufe96uq654+ffrM85BrPvc5z3O+JzITSZK0MsxNuwGSJOnQMfglSVpBDH5JklYQg1+SpBXE4JckaQUx+CVJWkHmp92AQ+H444/PDRs2TLsZkiQdErfddtuDmblu1GcrIvg3bNjA9u3bp90MSZIOiYj42v4+81K/JEkriMEvSdIKYvBLkrSCGPySJK0gBr8kSSuIwS9J0gpi8EuStIIY/JIkrSAGvyRJK4jBL0nSCmLwS5K0ghj8kiStIAZ/R7fe+00+s/DgtJshSVIrK+LpfJPwgZsXePSJp3ntqcdPuymSJI3NEX9Hz/SSXi+n3QxJklox+DvqZWLuS5JKY/B3lAmJyS9JKovB31Evk15v2q2QJKkdg7+jzCr8JUkqicHfUZKY+5Kk0hj8HfUc8UuSCjTR4I+IsyJiZ0QsRMSWEZ8fFhHX1Z/fEhEb6v2nR8Tt9esLEfFvxz3nodJLl/ZJksozseCPiFXAFcDZwEbg/IjYOHTYhcDDmXkq8D7g8nr/ncCmzHw5cBbwwYiYH/Och4Rz/JKkEk1yxH86sJCZ92TmU8C1wOahYzYDV9fb1wNnRERk5j9l5t56/+GwOLge55yHRKZz/JKk8kwy+E8E7mu831XvG3lMHfSPAGsBIuJVEbEDuAP42frzcc55SDjHL0kq0cwu7svMWzLzpcAPAJdExOFtvh8RF0XE9ojYvmfPnoPevp4jfklSgSYZ/LuBkxrv19f7Rh4TEfPAMcBDzQMy827g28DLxjxn/3tXZuamzNy0bt2676AboznHL0kq0SSD/1bgtIg4JSLWAOcBW4eO2QpcUG+fA9ycmVl/Zx4gIl4EvAS4d8xzHhKO+CVJJZrYY3kzc29EXAzcCKwCPpSZOyLiMmB7Zm4FrgKuiYgF4JtUQQ7wOmBLRDwN9IB3ZuaDAKPOOak+PBtH/JKkEk0s+AEycxuwbWjfpY3tJ4BzR3zvGuCacc85DVbukySVaGYX9806V/VLkkpk8HfUy6Rn7kuSCmPwd5VVER9Jkkpi8HdkrX5JUokM/o6c45cklcjg76iXSc9JfklSYQz+jjLxdj5JUnEM/o7SOX5JUoEM/o6c45cklcjg7yhJg1+SVByDv6NqxD/tVkiS1I7B31Fm4iS/JKk0Bn9HPp1PklQig7+jqla/wS9JKovB35Fz/JKkEhn8HfVH+z6oR5JUEoO/qzrvzX1JUkkM/o76I37n+SVJJTH4O+rP7zvPL0kqicHfUb9SvxX7JUklMfg76o/0r/3sfdNtiCRJLRj8HeXiqv4pN0SSpBYM/o5ycVW/yS9JKofB39HiffxTbockSW0Y/B31vI9fklQgg7+D5uV9V/VLkkpi8HfQHOU74pcklcTg76A3MOKXJKkcBn8HvYERv9EvSSqHwd9Bc17f2JcklcTg78A5fklSqQz+Dgbm+E1+SVJBDP4OzHpJUqkM/g4GR/xTbIgkSS0Z/B0MrOqfXjMkSWrN4O/C2/kkSYUy+DuwgI8kqVQGfwfO8UuSSmXwd2DlPklSqQz+DqzcJ0kqlcHfQbqqX5JUqIkGf0ScFRE7I2IhIraM+PywiLiu/vyWiNhQ739DRNwWEXfUP3+w8Z2/qs95e/16/iT7MIqV+yRJpZqf1I
kjYhVwBfAGYBdwa0Rszcy7GoddCDycmadGxHnA5cCPAQ8Cb87Mr0fEy4AbgRMb33tbZm6fVNsPxFr9kqRSTXLEfzqwkJn3ZOZTwLXA5qFjNgNX19vXA2dERGTm5zPz6/X+HcAREXHYBNvairfzSZJKNcngPxG4r/F+F4Oj9oFjMnMv8AiwduiYHwU+l5lPNvb9QX2Z/1cjIkb94xFxUURsj4jte/bs+U76sY+BUb7JL0kqyEwv7ouIl1Jd/v+Zxu63Zeb3Av+qfr191Hcz88rM3JSZm9atW3dQ2zW4uM/klySVY5LBvxs4qfF+fb1v5DERMQ8cAzxUv18P3AC8IzO/0v9CZu6uf34L+DDVlMIhZQEfSVKpJhn8twKnRcQpEbEGOA/YOnTMVuCCevsc4ObMzIg4FvgLYEtmfqZ/cETMR8Tx9fZq4E3AnRPsw0iu6pcklWpiwV/P2V9MtSL/buAjmbkjIi6LiLfUh10FrI2IBeDdQP+Wv4uBU4FLh27bOwy4MSK+CNxOdcXg9ybVh/3x6XySpFJN7HY+gMzcBmwb2ndpY/sJ4NwR33sP8J79nPaVB7ON3XipX5JUpple3DerHPFLkkpl8HfgHL8kqVQGfwfW6pcklcrg78Db+SRJpTL4O7CAjySpVAZ/Bz6kR5JUKoO/g55pL0kqlMHfgav6JUmlMvg76HmpX5JUKIO/kxyxJUnS7DP4Oxgc8Rv9kqRyGPwd9HqO+CVJZTL4O2iGvQN+SVJJDP4OBlb1T7EdkiS1ZfB3kM7xS5IKZfB34EN6JEmlMvg78CE9kqRSGfwdWLlPklQqg78DL/VLkkpl8Hcw8Chek1+SVBCDv4Neb2k7TX5JUkEM/g5c3CdJKpXB34GV+yRJpTL4O0gr90mSCmXwdzDwdD6jX5JUEIO/g8GSvdNrhyRJbRn8HVjAR5JUKoO/A5/OJ0kqlcHfgZf6JUmlMvg7aC7oM/clSSUx+DsYqNznkF+SVBCDv4OeYS9JKpTB34GV+yRJpTL4O0hv55MkFcrg76DnU3klSYUy+DtIg1+SVCiDvwMr90mSSmXwdzA4xz/FhkiS1JLB30HuZ1uSpFln8HfQ6znilySVyeDvYHBVv8kvSSrHRIM/Is6KiJ0RsRARW0Z8flhEXFd/fktEbKj3vyEibouIO+qfP9j4zivr/QsR8f6IiEn2YZSec/ySpEJNLPgjYhVwBXA2sBE4PyI2Dh12IfBwZp4KvA+4vN7/IPDmzPxe4ALgmsZ3fgf4aeC0+nXWpPowDoNfklSSSY74TwcWMvOezHwKuBbYPHTMZuDqevt64IyIiMz8fGZ+vd6/AziivjpwAnB0Zv5dVkvr/xB46wT7MNLAiN9L/ZKkgkwy+E8E7mu831XvG3lMZu4FHgHWDh3zo8DnMvPJ+vhdBzgnABFxUURsj4jte/bs6dyJUXou65ckFWqmF/dFxEupLv//TNvvZuaVmbkpMzetW7fuoLarP+CfC3NfklSWSQb/buCkxvv19b6Rx0TEPHAM8FD9fj1wA/COzPxK4/j1BzjnxPUv9UeElfskSUWZZPDfCpwWEadExBrgPGDr0DFbqRbvAZwD3JyZGRHHAn8BbMnMz/QPzsz7gUcj4tX1av53AH82wT6M1A/7uYDeof7HJUn6Dkws+Os5+4uBG4G7gY9k5o6IuCwi3lIfdhWwNiIWgHcD/Vv+LgZOBS6NiNvr1/Prz94J/D6wAHwF+MSk+rA/S5f6vdYvSSrL/CRPnpnbgG1D+y5tbD8BnDvie+8B3rOfc24HXnZwW9pOf3FflfsmvySpHDO9uG9W9RYv9Yf38UuSimLwd5ADi/um3BhJklow+DtIqoV9czjFL0kqi8HfQS+zWtgXeDufJKkoBn8Hvazm9wNH/JKkshj8HWQC4Ry/JKk8Bn8HmclcVLfzOeaXJJXE4O+gP8cf+FheSVJZDP4OMiGoL/VPuzGSJLVg8HfQXNzXc8gvSSqIwd9BL5NYnOOXJK
kcBn8HmUn1cEBX9UuSymLwd9Cv3BcW8JEkFcbg72BgVf+0GyNJUgtjBX9E/GlE/EhE+IcC1eK+CJ/OJ0kqz7hB/tvAjwNfjoj/FhEvnmCbZl5mdStfhCN+SVJZxgr+zLwpM98GvAK4F7gpIv5PRPxkRKyeZANnUb9yX39bkqRSjH3pPiLWAv8e+Cng88BvUf0h8KmJtGyGLc7xh5X7JEllmR/noIi4AXgxcA3w5sy8v/7ouojYPqnGzarFyn14I78kqSxjBT/we5m5rbkjIg7LzCczc9ME2jXTegNz/A75JUnlGPdS/3tG7Pu/B7MhJclM5ubwIT2SpOI864g/Il4InAgcERHfD4vXto8Gjpxw22ZWL7O6zO8cvySpMAe61P9GqgV964HfbOz/FvDLE2rTzOtX7sv0Ur8kqSzPGvyZeTVwdUT8aGZ+9BC1aeb1n87XIx3xS5KKcqBL/T+Rmf8L2BAR7x7+PDN/c8TXlr1ef1l/huN9SVJRDnSp/6j653Mm3ZCi1CP+jLSAjySpKAe61P/B+uevH5rmlKFXV+7rBfR6026NJEnjG/chPb8REUdHxOqI+MuI2BMRPzHpxs2qfuW+ObzUL0kqy7j38Z+ZmY8Cb6Kq1X8q8J8n1ahZt3h1P6zVL0kqy7jB358S+BHgTzLzkQm1pwj9Vf2BT+eTJJVl3JK9H4+ILwGPAz8XEeuAJybXrNm2WLmvZwEfSVJZxn0s7xbgXwKbMvNp4DFg8yQbNsuS6gE9PqRHklSacUf8AC+hup+/+Z0/PMjtKUJ/VX84xy9JKsy4j+W9BvhnwO3AM/XuZMUGf/10PpzjlySVZdwR/yZgYzq8BapRfkQV/v4XkSSVZNxV/XcCL5xkQ0qS/VX94UN6JEllGXfEfzxwV0R8FniyvzMz3zKRVs24/hw/uKpfklSWcYP/1ybZiNL0Mqs1/V7qlyQVZqzgz8y/jogXAadl5k0RcSSwarJNm12Z1Yr+anGfyS9JKse4tfp/Grge+GC960TgY5Nq1KwbmOM39yVJBRl3cd+7gNcCjwJk5peB50+qUbOu16/ch7fzSZLKMm7wP5mZT/Xf1EV8Dph5EXFWROyMiIWI2DLi88Mi4rr681siYkO9f21EfDoivh0RHxj6zl/V57y9fh3yP0AWK/dFWMBHklSUcYP/ryPil4EjIuINwJ8Af/5sX4iIVcAVwNnARuD8iNg4dNiFwMOZeSrwPuDyev8TwK8Cv7Sf078tM19evx4Ysw8HTa9/Hz+O+CVJZRk3+LcAe4A7gJ8BtgH/9QDfOR1YyMx76qsF17Jvff/NwNX19vXAGRERmflYZv4tM/ogoMWn81U38kuSVIxxV/X3IuJjwMcyc8+Y5z4RuK/xfhfwqv0dk5l7I+IRYC3w4AHO/QcR8QzwUeA9oyoKRsRFwEUAJ5988phNHs9S5T5zX5JUlmcd8Ufl1yLiQWAnsDMi9kTEpYemeSO9LTO/F/hX9evtow7KzCszc1Nmblq3bt1BbcDiqn58SI8kqSwHutT/i1Sr+X8gM4/LzOOoRu2vjYhfPMB3dwMnNd6vr/eNPKZeMHgM8NCznTQzd9c/vwV8mGpK4ZAafDrfof7XJUnq7kDB/3bg/Mz8an9HZt4D/ATwjgN891bgtIg4JSLWAOcBW4eO2QpcUG+fA9z8bA8Cioj5iDi+3l4NvInqOQKHVC+hXtfvpX5JUlEONMe/OjP3mW/PzD118O5XPWd/MXAjVZW/D2Xmjoi4DNiemVuBq4BrImIB+CbVHwcARMS9wNHAmoh4K3Am8DXgxvrfXgXcBPzeeF09eHJgxG/0S5LKcaDgf6rjZwBk5jaqOwCa+y5tbD8BnLuf727Yz2lfeaB/d9L6c/zg4j5JUlkOFPzfFxGPjtgfwOETaE8RFiv3+ZAeSVJhnjX4M3PFPojn2SxV7vMhPZKksoxbwEcN/cp9c7iqX5JUFoO/A+f4JUmlMvg7WKzV70N6JEmFMfg7WKzcF9
NuiSRJ7Rj8HSw9nc9V/ZKkshj8HWQ2VvUb/JKkghj8HSxW7sPb+SRJZTH4O+g15vgd8UuSSmLwdzBQuW/ajZEkqQWDv4Mq7JeW9HtLnySpFAZ/B82n80H/Mb2SJM0+g7+DxTn+etTfc8QvSSqEwd/BYq3+esRv7kuSSmHwd9Cs1Q+O+CVJ5TD4O2jW6gdH/JKkchj8HSxW7qvfO+KXJJXC4O9geFW/sS9JKoXB30EvYW4uFi/1O+KXJJXC4O+gl0mwVMIne9NsjSRJ4zP4O0iqhX1LBXwc8UuSymDwd9B8Oh84xy9JKofB38HS0/mc45cklcXg7yDr+/j7DH5JUikM/g56OTjHb+5Lkkph8LfUfwTvXMAcVu6TJJXF4G+p/wjewFX9kqTyGPwtNUf8fQa/JKkUBn9L/RF/s3KfuS9JKoXB31JzdO/iPklSaQz+jubCp/NJkspj8LfUa8zxu7hPklQag7+lxTn+CPpj/p65L0kqhMHfUn9VfzRG/FbrlySVwuBvafE+/oFa/VNskCRJLRj8LXkfvySpZAZ/S0uV+5Yey9vrTas1kiS1Y/C3tDjin4vFUX86xy9JKoTB39KoOX6v9EuSSmHwt7S4qr+xzzl+SVIpJhr8EXFWROyMiIWI2DLi88Mi4rr681siYkO9f21EfDoivh0RHxj6zisj4o76O++PiBg+7yT1I34umk/nO5QtkCSpu4kFf0SsAq4AzgY2AudHxMahwy4EHs7MU4H3AZfX+58AfhX4pRGn/h3gp4HT6tdZB7/1+zdQuY/+pX6TX5JUhkmO+E8HFjLznsx8CrgW2Dx0zGbg6nr7euCMiIjMfCwz/5bqD4BFEXECcHRm/l1WafuHwFsn2Id9DFTuc8QvSSrMJIP/ROC+xvtd9b6Rx2TmXuARYO0BzrnrAOcEICIuiojtEbF9z549LZu+f4uj+1ia53fEL0kqxbJd3JeZV2bmpszctG7duoN43urnnJX7JEkFmmTw7wZOarxfX+8beUxEzAPHAA8d4JzrD3DOifLpfJKkkk0y+G8FTouIUyJiDXAesHXomK3ABfX2OcDN+SzXzTPzfuDRiHh1vZr/HcCfHfym79/SffzNS/2HsgWSJHU3P6kTZ+beiLgYuBFYBXwoM3dExGXA9szcClwFXBMRC8A3qf44ACAi7gWOBtZExFuBMzPzLuCdwP8EjgA+Ub8OmaVa/c0CPia/JKkMEwt+gMzcBmwb2ndpY/sJ4Nz9fHfDfvZvB1528FrZTrNy3/A+SZJm3bJd3Dcpzcp9c87xS5IKY/C3NFC5b2ifJEmzzuBvaXBVfwzskyRp1hn8LfV61c/mEwJc3CdJKoXB31LWF/ajWbK3N8UGSZLUgsHf0qjKfY73JUmlMPhbGnw63+A+SZJmncHf0kDlvjr5neOXJJXC4G9p8T7+CPo39FnAR5JUCoO/pd7AHH+17YBfklQKg7+lZuU+5/glSaUx+FtqVu7Dkr2SpMIY/C31ekur+ufoP51vmi2SJGl8Bn9Liwv5mqv6vZNfklQIg7+lfsg3C/hYuU+SVAqDv6Vm5b4+5/glSaUw+FsafDpftc/clySVwuBvKZuV+/r7nOOXJBXC4G+p16zcF1bukySVxeBvaeDpfPU+5/glSaUw+FvqNSv3LRbwmV57JElqw+BvaWDE7+o+SVJhDP6Wlub4m7X6p9ceSZLaMPhb6jVX9VurX5JUGIO/tUblPlzVL0kqi8HfUm9gjr/aTkf8kqRCGPwtDVTuq/eZ+5KkUhj8LQ1U7lss4GPyS5LKYPC31Kzct7RvWq2RJKkdg7+lHDHH74hfklQKg7+lUZX7JEkqhcHf0mCt/nqO32v9kqRCGPwtDVTus1a/JKkwBn9LA6v6633O8UuSSmHwt5TNyn31kN/YlySVwuBvqVm5D6pRv5X7JEmlMPhbas7x9396qV+SVAqDv6XmHD9AEC7ukyQVw+BvKXNpjh+qPwAc8EuSSmHwt7TPHH84xy
9JKsdEgz8izoqInRGxEBFbRnx+WERcV39+S0RsaHx2Sb1/Z0S8sbH/3oi4IyJuj4jtk2z/KM3KfcP7JEmadfOTOnFErAKuAN4A7AJujYitmXlX47ALgYcz89SIOA+4HPixiNgInAe8FPgu4KaI+OeZ+Uz9vX+TmQ9Oqu3PJvcZ8TvHL0kqxyRH/KcDC5l5T2Y+BVwLbB46ZjNwdb19PXBGVDfHbwauzcwnM/OrwEJ9vqlbHPHX/+Wq2/mm1x5JktqYZPCfCNzXeL+r3jfymMzcCzwCrD3AdxP4ZETcFhEXTaDdz2pxVX/93tv5JEklmdil/gl6XWbujojnA5+KiC9l5t8MH1T/UXARwMknn3zQ/vFm5T6obudzcZ8kqRSTHPHvBk5qvF9f7xt5TETMA8cADz3bdzOz//MB4Ab2MwWQmVdm5qbM3LRu3brvuDN9o1b1O8cvSSrFJIP/VuC0iDglItZQLdbbOnTMVuCCevsc4Oashs9bgfPqVf+nAKcBn42IoyLiuQARcRRwJnDnBPuwj30q97F0FUCSpFk3sUv9mbk3Ii4GbgRWAR/KzB0RcRmwPTO3AlcB10TEAvBNqj8OqI/7CHAXsBd4V2Y+ExEvAG6oH44zD3w4M//3pPowul/Vz6WSva7qlySVY6Jz/Jm5Ddg2tO/SxvYTwLn7+e57gfcO7bsH+L6D39Lx7VO5Dwv4SJLKYeW+lkbO8fem2CBJklow+FsartwXEc7xS5KKYfC3tO/T+VzVL0kqh8HfUmYSUY30wQI+kqSyGPwt9XLwAT1gyV5JUjkM/paSXFzYB/Ucv8kvSSqEwd9SLxkMfpzjlySVw+BvqZeD1/qrAj4mvySpDAZ/WwlzzeDHOX5JUjkM/pZ6OTzHb61+SVI5DP6W9pnjt3KfJKkgBn9LvcyB2/kC5/glSeUw+FvKXKraB/0CPtNrjyRJbRj8LWUmc3ODt/N5H78kqRQGf0vDlfuqh/RIklQGg7+lfSv3WatfklQOg7+lXi49oKe5T5KkEhj8LfWfztfnHL8kqSQGf0s5XLkvwsp9kqRiGPwtjarc5xy/JKkUBn9L+6zqx+CXJJXD4G8phxb3VU/nm2KDJElqweBvqSrgs/Q+AG/klySVwuBvyTl+SVLJDP6W9p3j9yE9kqRyGPwtJSMey2vuS5IKYfC31Bsu4BNO8UuSymHwt1RV7ms+nS+s3CdJKobB39Jw5T5wcZ8kqRwGf0sjV/X3ptggSZJaMPhbGl7IFxHO8UuSimHwt1Rd6l8a8a9ZFTz6+NNTbJEkSeMz+Fsartz3gqMPZ/c/Ps6jTxj+kqTZZ/C3NDzHf8IxhwPwpfu/Na0mSZI0NoO/peHKfS885ggA7r7/0ek0SJKkFgz+lpLBp/Mdffg8xx65mi99w+CXJM0+g7+lzBy4jz8i+J4XHs1dXuqXJBXA4G+pN1S5D+B7Tjiav//Gt3jGov2SpBln8Lc0qnLfS054Lo8//Qxfe+ix6TRKkqQxGfwtjRrxbzzhaADu9nK/JGnGGfwtDa/qBzj1+c9h1Vy4wE+SNPMmGvwRcVZE7IyIhYjYMuLzwyLiuvrzWyJiQ+OzS+r9OyPijeOec+KGKvcB/OnndrP2qDXe0idJmnkTC/6IWAVcAZwNbATOj4iNQ4ddCDycmacC7wMur7+7ETgPeClwFvDbEbFqzHNO1IvWHsmG448cuf+mux/gF6+73bl+SdLMmp/guU8HFjLzHoCIuBbYDNzVOGYz8Gv19vXAB6KaQN8MXJuZTwJfjYiF+nyMcc6J+u/nft/I/T/8shM4cs08f/6Fr3PD53ez/nlH8LLvOobjnrOG445cw7FHruZ5R67huKPWcPQR86xZtYrV88HqVXOsnptj9XwwPzdH/2JCsFQvoNru7186oHks9fFL20vHx8Dth4PniRj8/j7/XgxPbEiSSjbJ4D8RuK/xfhfwqv0dk5l7I+IRYG29/++GvntivX2gc07FYatX8caXvpBXnX
IcO77+KPc+9Bi3fe1h/umpvTz+9DP7PNWvRPv7o6T/2cAfJft8J/b5Q0WSVHn7azaw5eyXHJJ/a5LBP1URcRFwUf322xGxcwL/zPHAgxM47yyxj8vDSugjrIx+2sflYaCPl9Svg+hF+/tgksG/Gzip8X59vW/UMbsiYh44BnjoAN890DkByMwrgSu7Nn4cEbE9MzdN8t+YNvu4PKyEPsLK6Kd9XB6m2cdJruq/FTgtIk6JiDVUi/W2Dh2zFbig3j4HuDkzs95/Xr3q/xTgNOCzY55TkiTtx8RG/PWc/cXAjcAq4EOZuSMiLgO2Z+ZW4Crgmnrx3jepgpz6uI9QLdrbC7wrM58BGHXOSfVBkqTlZqJz/Jm5Ddg2tO/SxvYTwLn7+e57gfeOc84pmuhUwoywj8vDSugjrIx+2sflYWp9jOrKuiRJWgks2StJ0gpi8Hcw9bLB36GI+FBEPBARdzb2HRcRn4qIL9c/n1fvj4h4f93XL0bEKxrfuaA+/ssRccGof2saIuKkiPh0RNwVETsi4j/V+5dNHwEi4vCI+GxEfKHu56/X+0+pS2Av1CWx19T7W5fInhV15c7PR8TH6/fLqo8RcW9E3BERt0fE9nrfcvt9PTYiro+IL0XE3RHxmuXUx4h4cf1/v/7r0Yj4hZnsY2b6avGiWlT4FeC7gTXAF4CN025Xyz68HngFcGdj328AW+rtLcDl9fYPA5+gqrvzauCWev9xwD31z+fV28+bdt/qtp0AvKLefi7w91QlnpdNH+v2BfCcens1cEvd/o8A59X7fxf4uXr7ncDv1tvnAdfV2xvr3+PDgFPq3+9V0+7fUF/fDXwY+Hj9fln1EbgXOH5o33L7fb0a+Kl6ew1w7HLrY6Ovq4BvUN1LP3N9nPp/oNJewGuAGxvvLwEumXa7OvRjA4PBvxM4od4+AdhZb38QOH/4OOB84ION/QPHzdIL+DPgDcu8j0cCn6OqZPkgMF/vX/x9pbob5jX19nx9XAz/DjePm4UXVb2OvwR+EPh43ebl1sd72Tf4l83vK1WNlq9Srytbjn0c6teZwGdmtY9e6m9vVCniE/dzbElekJn319vfAF5Qb++vv0X8d6gv9X4/1Wh42fWxvgR+O/AA8Cmqkew/Zube+pDCZRoAAAPMSURBVJBmmwdKZAPNEtmz3M//AfwXoFe/X8vy62MCn4yI26KqOgrL6/f1FGAP8Af1lM3vR8RRLK8+Np0H/HG9PXN9NPi1j6z+zCz+do+IeA7wUeAXMnPgmcnLpY+Z+UxmvpxqVHw6cGiKfR8iEfEm4IHMvG3abZmw12XmK6iePPquiHh988Nl8Ps6TzW9+DuZ+f3AY1SXvRctgz4CUK83eQvwJ8OfzUofDf72xilFXKJ/iIgTAOqfD9T799ffmf7vEBGrqUL/jzLzT+vdy6qPTZn5j8CnqS57HxtVCWwYbPNif2L8EtnT9lrgLRFxL3At1eX+32J59ZHM3F3/fAC4geqPuOX0+7oL2JWZt9Tvr6f6Q2A59bHvbOBzmfkP9fuZ66PB395yLRvcLJ98AdW8eH//O+oVqK8GHqkvW90InBkRz6tXqZ5Z75u6iAiqqpB3Z+ZvNj5aNn0EiIh1EXFsvX0E1TqGu6n+ADinPmy4n21KZE9dZl6SmeszcwPV/9Zuzsy3sYz6GBFHRcRz+9tUv2d3sox+XzPzG8B9EfHietcZVJVZl00fG85n6TI/zGIfp70IosQX1WrMv6eaT/2VabenQ/v/GLgfeJrqL/ELqeZB/xL4MnATcFx9bABX1H29A9jUOM9/ABbq109Ou1+Ndr2O6nLaF4Hb69cPL6c+1m37F8Dn637eCVxa7/9uqlBboLrceFi9//D6/UL9+Xc3zvUrdf93AmdPu2/76e+/ZmlV/7LpY92XL9SvHf3/n7IMf19fDmyvf18/RrVifbn18SiqK0zHNPbNXB+t3CdJ0gripX5JklYQg1+SpBXE4JckaQUx+CVJWkEMfk
mSVhCDX1InEfHz9VPW/mjabZE0Pm/nk9RJRHwJ+KHM3NXYN59LNfQlzSBH/JJai4jfpSo884mIeCQiromIzwDX1BUFPxoRt9av19bfWRsRn4yIHfVDWr4WEcdPtSPSCuSIX1Indf38TcDFwJupHjTzeER8GPjtzPzbiDiZ6pG53xMR7wcezMzLIuJHqB6xuy4zH5xWH6SVaP7Ah0jSAW3NzMfr7R8CNlaPTADg6PpJia8H/h1AZv5FRDx86JspyeCXdDA81tieA16dmU80D2j8ISBpipzjl3SwfRL4j/03EfHyevNvgB+v951N9ZAWSYeYwS/pYPt5YFNEfDEi7gJ+tt7/68DrI2IH1SX//zetBkormYv7JE1Ff3Ggi/ukQ8sRvyRJK4gjfkmSVhBH/JIkrSAGvyRJK4jBL0nSCmLwS5K0ghj8kiStIAa/JEkryP8HhGrMaid2NSQAAAAASUVORK5CYII=\n",
            "text/plain": [
              "<Figure size 576x432 with 1 Axes>"
            ]
          },
          "metadata": {
            "tags": [],
            "needs_background": "light"
          }
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "POufJw3qJS-T",
        "outputId": "22cf5b8f-8f19-4573-f7eb-da8eed572ab8",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 669
        }
      },
      "source": [
        "# Show the 20 vocabulary entries with the highest frequency, largest first.\n",
        "dict_corpus_df.sort_values(by='freq', ascending=False).head(20)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>words</th>\n",
              "      <th>freq</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>4871</th>\n",
              "      <td>business</td>\n",
              "      <td>7028</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>40059</th>\n",
              "      <td>work</td>\n",
              "      <td>5950</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>21968</th>\n",
              "      <td>management</td>\n",
              "      <td>5838</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>28644</th>\n",
              "      <td>product</td>\n",
              "      <td>5340</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>35971</th>\n",
              "      <td>team</td>\n",
              "      <td>3588</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>22228</th>\n",
              "      <td>marketing</td>\n",
              "      <td>3562</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>40423</th>\n",
              "      <td>year</td>\n",
              "      <td>3446</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>34832</th>\n",
              "      <td>strategy</td>\n",
              "      <td>2787</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>20675</th>\n",
              "      <td>lead</td>\n",
              "      <td>2527</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4776</th>\n",
              "      <td>build</td>\n",
              "      <td>2525</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>9591</th>\n",
              "      <td>design</td>\n",
              "      <td>2293</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>31619</th>\n",
              "      <td>sale</td>\n",
              "      <td>2208</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>24797</th>\n",
              "      <td>new</td>\n",
              "      <td>2034</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>28688</th>\n",
              "      <td>professional</td>\n",
              "      <td>2020</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>32584</th>\n",
              "      <td>service</td>\n",
              "      <td>2012</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>8751</th>\n",
              "      <td>customer</td>\n",
              "      <td>1977</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>16193</th>\n",
              "      <td>help</td>\n",
              "      <td>1963</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>34503</th>\n",
              "      <td>startup</td>\n",
              "      <td>1934</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>9980</th>\n",
              "      <td>digital</td>\n",
              "      <td>1895</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>17880</th>\n",
              "      <td>industry</td>\n",
              "      <td>1780</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "              words  freq\n",
              "4871       business  7028\n",
              "40059          work  5950\n",
              "21968    management  5838\n",
              "28644       product  5340\n",
              "35971          team  3588\n",
              "22228     marketing  3562\n",
              "40423          year  3446\n",
              "34832      strategy  2787\n",
              "20675          lead  2527\n",
              "4776          build  2525\n",
              "9591         design  2293\n",
              "31619          sale  2208\n",
              "24797           new  2034\n",
              "28688  professional  2020\n",
              "32584       service  2012\n",
              "8751       customer  1977\n",
              "16193          help  1963\n",
              "34503       startup  1934\n",
              "9980        digital  1895\n",
              "17880      industry  1780"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 64
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "g7VCLM5XKnlA",
        "outputId": "49cabfd8-dcc0-40f3-8d6c-4d8f91587c52",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "# Count the vocabulary entries whose frequency is exactly 1 and exactly 2.\n",
        "# A boolean mask summed over the column gives the number of matching rows.\n",
        "tot_words_freq_one = str((dict_corpus_df['freq'] == 1).sum())\n",
        "tot_words_freq_two = str((dict_corpus_df['freq'] == 2).sum())\n",
        "print(f'total words with frequency of 1: {tot_words_freq_one}')\n",
        "print(f'total words with frequency of 2: {tot_words_freq_two}')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "total words with frequency of 1: 17310\n",
            "total words with frequency of 2: 12158\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lW7XYD2AN3oX"
      },
      "source": [
        "# **Removing low frequency words from dictionary**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ltvlqS0BO3Mz",
        "outputId": "996aaaf5-363e-4a19-deef-b5b26e32ae49",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 669
        }
      },
      "source": [
        "# Keep only the words whose frequency is below 3 and give them a fresh 0..n-1 index.\n",
        "low_freq_words = (\n",
        "    dict_corpus_df\n",
        "    .loc[dict_corpus_df['freq'] < 3]\n",
        "    .reset_index(drop=True)\n",
        ")\n",
        "# Preview the first 20 low-frequency words.\n",
        "low_freq_words.head(20)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>words</th>\n",
              "      <th>freq</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>aaa</td>\n",
              "      <td>2</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>aaaif</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>aaaim</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>aaas</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>aadab</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>aadil</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>aaf</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>aagnya</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>8</th>\n",
              "      <td>aaifr</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>9</th>\n",
              "      <td>aaj</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>10</th>\n",
              "      <td>aakash</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>11</th>\n",
              "      <td>aakrit</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>12</th>\n",
              "      <td>aalami</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>13</th>\n",
              "      <td>aalto</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>14</th>\n",
              "      <td>aama</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>15</th>\n",
              "      <td>aami</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>16</th>\n",
              "      <td>aamster</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>17</th>\n",
              "      <td>aarp</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>18</th>\n",
              "      <td>aarron</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>19</th>\n",
              "      <td>aarrr</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "      words  freq\n",
              "0       aaa     2\n",
              "1     aaaif     1\n",
              "2     aaaim     1\n",
              "3      aaas     1\n",
              "4     aadab     1\n",
              "5     aadil     1\n",
              "6       aaf     1\n",
              "7    aagnya     1\n",
              "8     aaifr     1\n",
              "9       aaj     1\n",
              "10   aakash     1\n",
              "11   aakrit     1\n",
              "12   aalami     1\n",
              "13    aalto     1\n",
              "14     aama     1\n",
              "15     aami     1\n",
              "16  aamster     1\n",
              "17     aarp     1\n",
              "18   aarron     1\n",
              "19    aarrr     1"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 66
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "M83DvpEwOwlu"
      },
      "source": [
        "# Translate every low-frequency word into its dictionary id and remove those ids.\n",
        "# Words that are no longer in the dictionary are skipped, so re-running this cell\n",
        "# after the tokens were already removed is a no-op instead of raising KeyError.\n",
        "# Iterating the column directly also avoids depending on the frame's index labels.\n",
        "ids = [\n",
        "    id2word.token2id[word]\n",
        "    for word in low_freq_words['words']\n",
        "    if word in id2word.token2id\n",
        "]\n",
        "id2word.filter_tokens(bad_ids=ids)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "e8YRiEGpQYcn",
        "outputId": "2a20c450-9644-47ad-d548-0159905319f8",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "# Report how many tokens are currently held in the id2word dictionary.\n",
        "print(f'New vocabulary size: {len(id2word)}')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "New vocabulary size: 11621\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Wa5WmzNDQwvE"
      },
      "source": [
        "# **Filter extreme words**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7cUCPzWoQ4ia",
        "outputId": "3b9819c8-55a8-4e87-dfb8-fe8e3751b478",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "# Filter out words that occur in fewer than 10 documents, or in more than\n",
        "# 90% of the documents (no_above=0.90).\n",
        "id2word.filter_extremes(no_below=10, no_above=0.90)\n",
        "print('Total Vocabulary Size:', len(id2word))\n"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Total Vocabulary Size: 3775\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NpsTxBG1RJwL"
      },
      "source": [
        "# **Creating the final version of the corpus**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kwFfr39FR0P-",
        "outputId": "53fd0103-489a-4600-e6ba-2d78c9c6fe16",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "# Print the dictionary's summary repr (unique-token count and a few sample tokens).\n",
        "print(id2word)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Dictionary(3775 unique tokens: ['ability', 'advertising', 'analytics', 'area', 'base']...)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "816QNKeXTHW0"
      },
      "source": [
        "# Build the corpus by passing each entry of `texts` through id2word.doc2bow.\n",
        "corpus = list(map(id2word.doc2bow, texts))"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Ek0jovtsn7Mv"
      },
      "source": [
        "# **Save the corpus and dictionary with Pickle (binary files, despite the .txt extension)**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ZoTZ8BR7oXGI"
      },
      "source": [
        "import pickle\n",
        "\n",
        "# Pickle the corpus first and then the word dictionary (id2word) to Drive.\n",
        "# NOTE: both files are binary pickles even though their names end in .txt.\n",
        "pickle_targets = (\n",
        "    (corpus, \"/content/drive/My Drive/NLP/with_swaraj/Data/lda_corpus_bigram_threshold_70.txt\"),\n",
        "    (id2word, \"/content/drive/My Drive/NLP/with_swaraj/Data/lda_id2word_bigram_threshold_70.txt\"),\n",
        ")\n",
        "for artifact, destination in pickle_targets:\n",
        "    with open(destination, \"wb\") as fp:\n",
        "        pickle.dump(artifact, fp)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "rfsEeKKAqG5A"
      },
      "source": [
        "# **Load the saved corpus**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ePGVt-6VpbIY"
      },
      "source": [
        "# Read the pickled corpus file back from Drive and bind the result to `b`.\n",
        "# Relies on `pickle` having been imported by the save cell earlier in the notebook.\n",
        "# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.\n",
        "with open(\"/content/drive/My Drive/NLP/with_swaraj/Data/lda_corpus_bigram_threshold_70.txt\", \"rb\") as fp:   # Unpickling\n",
        "  b = pickle.load(fp)"
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}